#include <time.h>
#include <mysql/mysql.h>
#include <string.h>
#include "shtml.h"
#include <stdarg.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <signal.h>
#include <errno.h>

// Size of the fixed text buffers that hold configuration values and lines.
#define BUFFER_SIZE 1024
// MySQL connection settings; init_mysql_conf() clears them and then fills
// them from the configuration file.
static char host_name[BUFFER_SIZE];
static char user_name[BUFFER_SIZE];
static char password [BUFFER_SIZE];
static char db_name  [BUFFER_SIZE];

// MySQL server port; init_mysql_conf() resets it to 3306 before parsing.
static int port_number;
// Set by main() when any command-line argument is given. The *_detail
// functions then print the scraped article instead of posting it.
static bool DEBUG = false;

// Shared MySQL connection handle. init_mysql() creates it while it is NULL;
// executesql() resets it to NULL after a failed query.
static MYSQL *conn;


// NOTE(review): not referenced anywhere in the visible code; presumably a
// polling interval in seconds for a daemon mode -- confirm before removing.
int sleep_time = 600;

// --- Configuration file parsing ---

/**
 * Returns the index of the first character after the first '=' in c.
 *
 * When c contains no '=', the index of the terminating NUL is returned, so
 * c + after_equal(c) is always a valid (possibly empty) string. The previous
 * version returned one past the terminator in that case.
 */
int after_equal(char * c){
	int i=0;
	while (c[i]!='\0' && c[i]!='=') i++;
	if (c[i]=='=') i++;	// step over the '=' only when one was found
	return i;
}
/**
 * Reduces c, in place, to its first whitespace-delimited token.
 *
 * Leading whitespace is skipped and the text is cut at the next whitespace
 * character or at the end of the string, whichever comes first. Working in
 * place removes the fixed-size scratch buffer, and both scans now stop at
 * the terminating NUL instead of running past it.
 */
void trim(char * c)
{
    char *start = c;
    // Cast to unsigned char: passing a negative char to isspace() is undefined.
    while (*start != '\0' && isspace((unsigned char)*start)) start++;

    char *end = start;
    while (*end != '\0' && !isspace((unsigned char)*end)) end++;
    *end = '\0';

    if (start != c)
        memmove(c, start, (size_t)(end - start) + 1);   // +1 copies the NUL
}
/**
 * Extracts the value of a "KEY=value" configuration line.
 *
 * When buf starts with key, the text following the first '=' is copied into
 * value, passed through trim(), and true is returned. Otherwise value is
 * left untouched and false is returned.
 */
bool read_buf(char * buf,const char * key,char * value)
{
    const size_t key_len = strlen(key);
    if (strncmp(buf, key, key_len) != 0)
        return false;

    char *after = buf + after_equal(buf);
    strcpy(value, after);
    trim(value);
    return true;
}
void read_int(char * buf,const char * key,int * value)
{
	char buf2[BUFFER_SIZE];
	if (read_buf(buf,key,buf2))
		sscanf(buf2, "%d", value);

}
/**
 * Appends one printf-style formatted line, prefixed with the current UTC
 * time, to /var/log/baogia.log.
 *
 * The message is truncated to fit the local buffer. When the log file cannot
 * be opened, a notice goes to stderr and the message is dropped; the
 * previous version went on to write through the NULL stream.
 */
void write_log(const char *fmt, ...)
{
	char times[20];
	char message[4096];

	time_t now = time (0);
	struct tm *sTm = gmtime (&now);
	strftime (times, sizeof(times), "%Y-%m-%d %H:%M:%S", sTm);

	FILE *fp = fopen("/var/log/baogia.log", "a+");
	if (fp==NULL)
	{
		fprintf(stderr,"openfile error!\n");
		system("pwd");
		return;
	}

	va_list ap;
	va_start(ap, fmt);
	// Bounded formatting: callers pass whole SQL statements as arguments.
	vsnprintf(message, sizeof(message), fmt, ap);
	va_end(ap);

	fprintf(fp,"%s \t %s\n",times,message);
	fclose(fp);
}
/**
 * Records a published post in /var/log/baogia_post.log as
 * "<post_id>@<source>#<slug>". slug_logged() searches this file to avoid
 * posting the same article twice.
 *
 * @param nguon   name of the source site
 * @param slug    slug of the article
 * @param post_id id of the inserted post
 *
 * When the file cannot be opened, a notice goes to stderr and nothing is
 * written; the previous version wrote through the NULL stream.
 */
void source_log(const char *nguon, const char *slug,int post_id)
{
	FILE *fp = fopen("/var/log/baogia_post.log", "a+");
	if (fp==NULL)
	{
		fprintf(stderr,"openfile error!\n");
		return;
	}
	fprintf(fp,"%d@%s#%s\n",post_id,nguon,slug);
	fclose(fp);
}
/**
 * Appends url on its own line to /var/log/duyetweb.log.
 *
 * When the file cannot be opened, a notice goes to stderr and nothing is
 * written; the previous version wrote through the NULL stream.
 */
void post_log(const char *url)
{
	FILE *fp = fopen("/var/log/duyetweb.log", "a+");
	if (fp==NULL)
	{
		fprintf(stderr,"openfile error!\n");
		return;
	}
	fprintf(fp,"%s\n",url);
	fclose(fp);
}
//Khoi tao mysql
bool init_mysql_conf()
{
	FILE *fp=NULL;
	char buf[BUFFER_SIZE];
	host_name[0]=0;
	user_name[0]=0;
	password[0]=0;
	db_name[0]=0;
	port_number=3306;

	fp = fopen("/etc/baogia.conf", "r");
	if(fp!=NULL)
    {
        while (fgets(buf, BUFFER_SIZE - 1, fp))
        {
            read_buf(buf,"F_HOST_NAME",host_name);
            read_buf(buf, "F_USER_NAME",user_name);
            read_buf(buf, "F_PASSWORD",password);
            read_buf(buf, "F_DB_NAME",db_name);
            read_int(buf , "F_PORT_NUMBER", &port_number);

        }
        write_log("Load file %s %s %s %s",host_name,user_name,password,db_name);
		return true;
	//	fclose(fp);
    }
    else
    {
        write_log("Cannot open file 'giavang.conf'");
        return false;
    }
}
bool executesql(const char * sql)
{
	if (mysql_real_query(conn,sql,strlen(sql)))
    {
		write_log("Error in sql %s:%s",sql,mysql_error(conn));
		sleep(20);
		conn=NULL;
		return false;
	}
	else
	    return true;
}

bool init_mysql() {
    if(conn==NULL)
    {
		conn=mysql_init(NULL);		// init the database connection
		/* connect the database */
		const char timeout=30;
		mysql_options(conn,MYSQL_OPT_CONNECT_TIMEOUT,&timeout);

		if(!mysql_real_connect(conn,host_name,user_name,password,db_name,port_number,0,0))
        {
			write_log("Error init mysql with host=%s,user=%s,pass=%s,db=%s: %s",host_name,user_name,password,db_name,mysql_error(conn));
			sleep(20);
			return false;
		}
	}
	if (!executesql("set names utf8"))
    {
        return false;
    }

	return true;
}
// --- Main processing ---

/**
 * Links post post_id to term_taxonomy_id term_id in wp_term_relationships
 * (INSERT IGNORE, so an existing link is left alone).
 *
 * The statement is at least 110 characters long, so the previous 100-byte
 * buffer always overflowed; it is now sized for the longest possible pair
 * of ints and filled with snprintf.
 */
void post_add_term(int post_id,int term_id)
{
    char sql[256];
    snprintf(sql,sizeof(sql),"INSERT IGNORE INTO `wp_term_relationships` (`object_id`,`term_taxonomy_id`,`term_order`) VALUES ('%d','%d','0');",post_id,term_id);
    executesql(sql);
}
/**
 * Attaches post post_id to the fixed term_taxonomy_ids 2, 3, 4, 5 and 6 in
 * wp_term_relationships with a single INSERT IGNORE statement.
 */
void post_add_cat_tag(int post_id)
{
    static const char statement[] =
        "INSERT IGNORE INTO `wp_term_relationships` (`object_id`,`term_taxonomy_id`,`term_order`) VALUES ('%d','2','0'),('%d','3','0'),('%d','4','0'),('%d','5','0'),('%d','6','0');";
    char query[400];

    sprintf(query,statement,post_id,post_id,post_id,post_id,post_id);
    executesql(query);
}

void post_blog(const string title, const string slug, const string content,const char * nguon="")
{
    if (title.length()<10) return;
    if (content.length()<300) return;
    //cout<<content.length()<<endl;
    char times[30];
    struct tm *sTm;
    time_t now = time (0);
    sTm = gmtime (&now);
    sTm->tm_hour+=6;
    mktime(sTm);
    strftime (times, sizeof(times), "%Y-%m-%d %H:%M:%S", sTm);
    //insert to wp_posts
    char pdau[]="(`post_author`,`post_date`,`post_date_gmt`,`post_content`,`post_title`,`post_excerpt`,`post_status`,`comment_status`,`ping_status`,`post_password`,`post_name`,`to_ping`,`pinged`,`post_modified`,`post_modified_gmt`,`post_content_filtered`,`post_parent`,`guid`,`menu_order`,`post_type`,`post_mime_type`,`comment_count`)";
    char sql[12000];
    sprintf(sql,"INSERT INTO `wp_posts` %s VALUES ('1','%s','%s','%s','%s','','publish','open','open','','%s','','','%s','%s','','0','http://baogia.info/%s','0','post','','0');",pdau,times,times,content.c_str(),title.c_str(),slug.c_str(),times,times,slug.c_str());
    //cout<<strlen(sql)<<endl;
    executesql(sql);
    //add post to category & tags ->postmeta
    int pid = mysql_insert_id(conn);
    post_add_cat_tag(pid);
    //source log -> avoid repeat post
    source_log(nguon,slug.c_str(),pid);
    cout<<"POST "<<pid<<". "<<slug<<endl;
    sprintf(sql,"http://baogia.info/%d/%d/%d/%s",sTm->tm_year+1900,sTm->tm_mon+1,sTm->tm_mday,slug.c_str());
    post_log(sql);
}
/**
 * Tells whether slug already appears in /var/log/baogia_post.log, i.e.
 * whether the article was posted before.
 *
 * @return true when some line of the log contains slug, and also when the
 *         answer cannot be determined (NULL or empty slug, unreadable log),
 *         so that nothing is posted twice by mistake. A log file that does
 *         not exist yet means nothing has been posted: false.
 *
 * The log is now read directly instead of running "grep -c <slug>" through
 * a shell. The slug comes from a scraped URL, so building a command line
 * from it allowed command injection and could overflow the command buffer.
 * The old code also read an uninitialised buffer when grep printed nothing.
 * The match is now a literal substring match rather than a regular
 * expression.
 */
bool slug_logged(const char *slug)
{
    if (slug==NULL || slug[0]=='\0')
        return true;

    FILE *in = fopen("/var/log/baogia_post.log","r");
    if (in==NULL)
        return errno!=ENOENT;

    char line[10240];
    bool found = false;
    while (fgets(line, sizeof(line), in)!=NULL)
    {
        if (strstr(line,slug)!=NULL)
        {
            found = true;
            break;
        }
    }
    if (!found && ferror(in))
        found = true;   // read error: stay on the safe side
    fclose(in);
    return found;
}
/**
 * Scrapes one vnexpress.net article and posts it (or prints it in DEBUG mode).
 *
 * The slug is the last path segment of url with ".html" and ":" removed.
 *
 * @return false only when slug_logged() reports the slug as already posted
 *         (callers stop scanning then); true in every other case, including
 *         a failed page load.
 *
 * NOTE(review): the retain/remove/cut calls come from shtml.h and walk a
 * fixed path through the page's nested <div>s; presumably they keep, drop
 * or extract the n-th matching tag. The sequence is tied to the site's
 * markup and must stay in this exact order -- confirm against shtml.h.
 */
bool vnexpress_detail(const string url)
{
    shtml sh,tr;
    string title, slug, content;
    // Derive the slug from the URL: drop everything up to the last "/".
    sh.setContent(url);
    while (sh.count("/")>0) sh.deleteTo("/");
    sh.replace(".html","");
    sh.replace(":","");
    slug=sh.getContent();

    if (slug_logged(slug.c_str())) return false;
    if (!sh.loadFromURL(url)) return true;
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.removeTagByName("div");
    sh.retainTagByName("div");
    //cout<<sh.getContent()<<endl;
    sh.retainTagByName("div");
    // Cut <div>s until one carries "col_left" (or none is left).
    tr=sh.cutTagByName("div");
    while (!tr.isEmpty() && !tr.containAttr("col_left"))
        tr=sh.cutTagByName("div");
    sh=tr;
    //sh.removeTagByName("div",3);
    //sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");

    sh.removeTagByName("div");

    // Title: next <div>, markup and single quotes stripped.
    tr=sh.cutTagByName("div");
    tr.removeAllTags();
    tr.replace("'","",-1); // strip single quotes
    title=tr.getContent();

    // Lead paragraph: next <div>, wrapped in <strong>.
    tr=sh.cutTagByName("div");
    content=string("<strong>")+tr.getContent()+string("</strong><br/>");

    sh.removeTagByName("div");
    sh.retainTagByName("div");
    sh.removeTagByName("script",-1); // remove the script tags
    sh.replace("'","",-1); // strip single quotes
    // Body followed by the source credit and the site's fixed footer links.
    content+=sh.getContent()+"<br/><span class=\"source_url\">Source: v n e x p r e s s . n e t</span>";
    //content+="<span class=\"orgirin_source_url\">Source: "+url+"</span>";
    content+="<div id=\"baogiainfo\"> <a href=\"http://baogia.info\">Báo giá vàng</a> | <a href=\"http://baogia.info\">http://baogia.info</a></div>";
    content+="<div id=\"xemthem\">Xem thêm: <ul>";
    content+="<li><a href=\"http://baogia.info/bao-gia-vang/\">Giá vàng hiện tại</a></li>";
    content+="<li><a href=\"http://baogia.info/bien-dong/\">Bi&#7871;n &#273;&#7897;ng giá vàng trong ngày</a></li>";
    content+="<li><a href=\"http://baogia.info/dang-ki/\">Nh&#7853;n báo giá vàng th&#432;&#7901;ng xuy&#234;n</a></li>";
    content+="</ul></div>";
    if (!DEBUG)
        post_blog(title,slug,content,"vnexpress.net");
    else
    {
        cout<<"Get from vnexpress.net "<<url<<endl;
        cout<<"-Title: "<<title<<endl;
        cout<<"-slug: "<<slug<<endl;
        cout<<"-Content: "<<content<<endl;
    }

    return true;
}

/**
 * Scans the kinhdoanh.vnexpress.net front page and hands every link whose
 * attributes mention "vang" ("gold") to vnexpress_detail().
 *
 * Returns silently when the page cannot be loaded, and stops scanning as
 * soon as vnexpress_detail() returns false (slug already posted).
 *
 * NOTE(review): the retainTagByName sequence follows the page's nested
 * <div> layout and is order-dependent; the exact meaning of the optional
 * numeric argument is defined in shtml.h -- confirm there.
 */
void vnexpress_cat()
{
    shtml sh,tr,td,a;
    string http;
    bool bRet;
    if (!sh.loadFromURL("http://kinhdoanh.vnexpress.net/")) return;
    cout<<"Lay tin tu vnexpress.net"<<endl;
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    //cout<<sh.getContent()<<endl;
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    // First candidate: a single link taken from the first cut <div>.
    tr=sh.cutTagByName("div");
    tr.retainTagByName("div",2);
    tr.retainTagByName("a");
    if (tr.containAttr("vang"))
    {
        // The URL is the first double-quoted value in the tag's attributes.
        td.setContent(tr.getAttr());
        http=td.getBetween("\"","\"");
        bRet=vnexpress_detail(http);
        if (!bRet) return;
    }

    // Remaining candidates: one link per <li>.
    sh.retainTagByName("div");
    tr=sh.cutTagByName("li");
    while (!tr.isEmpty())
    {
        td=tr.cutTagByName("a");
        if (td.containAttr("vang-")||td.containAttr("-vang"))
        {
            a.setContent(td.getAttr());
            http=a.getBetween("\"","\"");
            bRet=vnexpress_detail(http);
            if (!bRet) return;
        }
        tr=sh.cutTagByName("li");
    }
}
/**
 * Scrapes one ndhmoney.vn article and posts it (or prints it in DEBUG mode).
 *
 * The slug is the last path segment of url with ":" removed.
 *
 * @return false only when slug_logged() reports the slug as already posted
 *         (callers stop scanning then); true in every other case, including
 *         a failed page load.
 *
 * NOTE(review): the retainTagByName sequence walks a fixed path through the
 * page's nested <div>s and is order-dependent; the semantics of the calls
 * and of their numeric argument are defined in shtml.h -- confirm there.
 */
bool ndhmoney_detail(const string url)
{
    shtml sh,tr;
    string title, slug, content;
    // Derive the slug from the URL: drop everything up to the last "/".
    sh.setContent(url);
    while (sh.count("/")>0) sh.deleteTo("/");
    //sh.replace(".html","");
    sh.replace(":","");
    slug=sh.getContent();

    if (slug_logged(slug.c_str())) return false;
    if (!sh.loadFromURL(url)) return true;
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    //cout<<sh.getContent()<<endl;
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    //sh.removeTagByName("div");
    //cout<<slug<<endl;
    //cout<<sh.getContent();
    //return true;
    // Title: the <h1>, single quotes stripped.
    tr=sh.cutTagByName("h1");
    tr.replace("'","",-1);
    title=tr.getContent();

    // Lead paragraph: the first <p>, without the "(NDHMoney)" tag line.
    tr=sh.cutTagByName("p");
    tr.replace("(NDHMoney)","");
    tr.removeTagByName("script",-1);
    tr.replace("'","",-1); // strip single quotes
    content=string("<strong>")+tr.getContent()+string("</strong><br/>");

    // Body: the next <div>, skipping one marked "bai_cung_chu_de".
    tr=sh.cutTagByName("div");
    if (tr.containAttr("bai_cung_chu_de"))
    {
        tr=sh.cutTagByName("div");
    }
    tr.removeTagByName("script",-1); // remove the script tags
    tr.replace("'","",-1); // strip single quotes
    tr.replace("NDHMoney","",-1); // strip the source's name
    // Body followed by the source credit and the site's fixed footer links.
    content+=tr.getContent()+"<br/><span class=\"source_url\">Source: n d h m o n e y . v n</span>";
    //content+="<span class=\"orgirin_source_url\">Source: "+url+"</span>";
    content+="<div id=\"baogiainfo\"> <a href=\"http://baogia.info\">Báo giá vàng</a> | <a href=\"http://baogia.info\">http://baogia.info</a></div>";
    content+="<div id=\"xemthem\">Xem thêm: <ul>";
    content+="<li><a href=\"http://baogia.info/bao-gia-vang/\">Giá vàng hiện tại</a></li>";
    content+="<li><a href=\"http://baogia.info/bien-dong/\">Bi&#7871;n &#273;&#7897;ng giá vàng trong ngày</a></li>";
    content+="<li><a href=\"http://baogia.info/dang-ki/\">Nh&#7853;n báo giá vàng th&#432;&#7901;ng xuy&#234;n</a></li>";
    content+="</ul></div>";
    if (!DEBUG)
        post_blog(title,slug,content,"ndhmoney.vn");
    else
    {
        cout<<"Get from ndhmoney.vn: "<<url<<endl;
        cout<<"-Title: "<<title<<endl;
        cout<<"-slug: "<<slug<<endl;
        cout<<"-Content: "<<content<<endl;
    }
    return true;
}

/**
 * Scans the ndhmoney.vn commodities listing and hands every link whose
 * attributes contain "vang-" or "-vang" to ndhmoney_detail().
 *
 * Returns silently when the page cannot be loaded, and stops scanning as
 * soon as ndhmoney_detail() returns false (slug already posted).
 *
 * NOTE(review): the retainTagByName sequence follows the page's nested
 * <div> layout and is order-dependent; the meaning of the numeric argument
 * is defined in shtml.h -- confirm there.
 */
void ndhmoney_cat()
{
    shtml sh,tr,td;
    string http;
    bool bRet;
    if (!sh.loadFromURL("http://ndhmoney.vn/web/guest/dau-tu/hang-hoa")) return;
    cout<<"Lay tin tu ndhmoney.vn"<<endl;
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    //cout<<sh.getContent()<<endl;
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    //cout<<sh.getContent();

    // One candidate per cut <div>: the <a> inside its <h3>.
    tr=sh.cutTagByName("div");
    while (!tr.isEmpty())
    {
        tr.retainTagByName("h3");
        tr.retainTagByName("a");
        if (tr.containAttr("vang-")||tr.containAttr("-vang"))
        {
            // The URL is the first double-quoted value in the tag's attributes.
            td.setContent(tr.getAttr());
            http=td.getBetween("\"","\"");
            bRet=ndhmoney_detail(http);
            if (!bRet) return;
        }
        tr=sh.cutTagByName("div");
    }
}
/**
 * Scrapes one vneconomy.vn article and posts it (or prints it in DEBUG mode).
 *
 * The slug is the last path segment of url with ".htm" and ":" removed.
 *
 * @return false only when slug_logged() reports the slug as already posted
 *         (callers stop scanning then); true in every other case, including
 *         a failed page load.
 *
 * NOTE(review): the retain/remove/cut sequence walks a fixed path through
 * the page's nested tags and is order-dependent; the semantics of the calls
 * and of their numeric argument are defined in shtml.h -- confirm there.
 */
bool vneconomy_detail(const string url)
{
    shtml sh,tr,td;
    string title, slug, content;
    // Derive the slug from the URL: drop everything up to the last "/".
    sh.setContent(url);
    while (sh.count("/")>0) sh.deleteTo("/");
    sh.replace(".htm","");
    sh.replace(":","");
    slug=sh.getContent();
    if (slug_logged(slug.c_str())) return false;
    if (!sh.loadFromURL(url)) return true;
    sh.retainTagByName("div",5);
    sh.retainTagByName("div");
    sh.retainTagByName("div",3);
    sh.retainTagByName("div");
    //sh.retainTagByName("div");
    //sh.removeTagByName("div");
    //cout<<slug<<endl;
    //cout<<sh.getContent();
    //return true;
    sh.removeTagByName("div");
    // Header <div>: <h1> is the title, <h2> the lead paragraph.
    tr=sh.cutTagByName("div");
    td=tr.cutTagByName("h1");
    td.replace("'","",-1);
    title=td.getContent();

    td=tr.cutTagByName("h2");

    //tr.removeTagByName("script",-1);
    td.removeAllTags();
    td.replace("'","",-1); // strip single quotes
    content=string("<strong>")+td.getContent()+string("</strong><br/>");

    // Body <div>, narrowed down and cleaned before use.
    tr=sh.cutTagByName("div");
    tr.removeTagByName("div");
    tr.retainTagByName("div");
    tr.removeTagByName("span");
    tr.removeTagByName("div");
    tr.removeTagByName("div");
    tr.retainTagByName("div");
    //cout
    tr.removeTagByName("div");// remove the logo
    tr.removeTagByName("script",-1); // remove the script tags
    tr.replace("'","",-1); // strip single quotes
    // Body followed by the source credit and the site's fixed footer links.
    content+=tr.getContent()+"<br/><span class=\"source_url\">Source: v n e c o n o m y . v n</span>";
    //content+="<span class=\"orgirin_source_url\">Source: "+url+"</span>";
    content+="<div id=\"baogiainfo\"> <a href=\"http://baogia.info\">Báo giá vàng</a> | <a href=\"http://baogia.info\">http://baogia.info</a></div>";
    content+="<div id=\"xemthem\">Xem thêm: <ul>";
    content+="<li><a href=\"http://baogia.info/bao-gia-vang/\">Giá vàng hiện tại</a></li>";
    content+="<li><a href=\"http://baogia.info/bien-dong/\">Bi&#7871;n &#273;&#7897;ng giá vàng trong ngày</a></li>";
    content+="<li><a href=\"http://baogia.info/dang-ki/\">Nh&#7853;n báo giá vàng th&#432;&#7901;ng xuy&#234;n</a></li>";
    content+="</ul></div>";
    if (!DEBUG)
        post_blog(title,slug,content,"vneconomy.vn");
    else
    {
        cout<<"Get from vneconomy.vn: "<<url<<endl;
        cout<<"-Title: "<<title<<endl;
        cout<<"-slug: "<<slug<<endl;
        cout<<"-Content: "<<content<<endl;
    }
    return true;
}

/**
 * Scans the vneconomy.vn finance listing and hands every link whose
 * attributes contain "vang-" or "-vang" to vneconomy_detail().
 *
 * The href values are prefixed with "http://vneconomy.vn" before use.
 * Returns silently when the page cannot be loaded, and stops scanning as
 * soon as vneconomy_detail() returns false (slug already posted).
 *
 * NOTE(review): the retainTagByName sequence follows the page's nested
 * <div> layout and is order-dependent; the meaning of the numeric argument
 * is defined in shtml.h -- confirm there.
 */
void vneconomy_cat()
{
    shtml sh,tr,td;
    string http;
    bool bRet;
    if (!sh.loadFromURL("http://vneconomy.vn/p0c6/tai-chinh.htm")) return;
    cout<<"Lay tin tu vneconomy.vn"<<endl;
    sh.retainTagByName("div",5);
    sh.retainTagByName("div");
    sh.retainTagByName("div",3);
    //cout<<sh.getContent()<<endl;
    sh.retainTagByName("div");
    sh.retainTagByName("div",2);

    // First candidate: the link inside the first cut <div>.
    tr=sh.cutTagByName("div");
    tr.retainTagByName("a");
    if (tr.containAttr("vang-")||tr.containAttr("-vang"))
    {
        td.setContent(tr.getAttr());
        http=td.getBetween("href=\"","\"");
        http="http://vneconomy.vn"+http;
        bRet=vneconomy_detail(http);
        if (!bRet) return;
    }
    //cout<<sh.getContent();
    // Remaining candidates: one link per <li> of the following <ul>.
    sh.retainTagByName("ul");
    tr=sh.cutTagByName("li");
    while (!tr.isEmpty())
    {
        tr.retainTagByName("a");
        if (tr.containAttr("vang-")||tr.containAttr("-vang"))
        {
            td.setContent(tr.getAttr());
            http=td.getBetween("href=\"","\"");
            http="http://vneconomy.vn"+http;
            bRet=vneconomy_detail(http);
            if (!bRet) return;
        }
        tr=sh.cutTagByName("li");
    }
}
/**
 * Scrapes one giavang.net article and posts it (or prints it in DEBUG mode).
 *
 * The slug is taken from the tail of url: text is deleted up to "/" until
 * at most one "/" remains, then "/" and ":" are removed.
 *
 * @return false only when slug_logged() reports the slug as already posted
 *         (callers stop scanning then); true in every other case, including
 *         a failed page load.
 *
 * NOTE(review): the retain/remove/cut sequence walks a fixed path through
 * the page's nested <div>s and is order-dependent; the semantics of the
 * calls and of their numeric argument are defined in shtml.h -- confirm
 * there.
 */
bool giavang_detail(const string url)
{
    shtml sh,tr,td;
    string title, slug, content;
    sh.setContent(url);
    while (sh.count("/")>1) sh.deleteTo("/");
    sh.replace("/","");
    sh.replace(":","");
    slug=sh.getContent();
    if (slug_logged(slug.c_str())) return false;
    if (!sh.loadFromURL(url)) return true;
    sh.retainTagByName("div");
    sh.removeTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.removeTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");

    sh.removeTagByName("div");

    // Title: the <h1>, single quotes stripped.
    td=sh.cutTagByName("h1");
    td.replace("'","",-1);
    title=td.getContent();

    // Body: the next <div> after the removed ones.
    sh.removeTagByName("div",3);
    tr=sh.cutTagByName("div");

    tr.removeTagByName("script",-1); // remove the script tags
    tr.replace("'","",-1); // strip single quotes
    tr.replace("Giavang.net","GVN");
    //content+=tr.getContent()+"<br/>Source: g i a v a n g . n e t";
    // Body followed by the source credit and the site's fixed footer links.
    content+=tr.getContent()+"<br/><span class=\"source_url\">Source: g i a v a n g . n e t</span>";
    //content+="<span class=\"orgirin_source_url\">Source: "+url+"</span>";
    content+="<div id=\"baogiainfo\"> <a href=\"http://baogia.info\">Báo giá vàng</a> | <a href=\"http://baogia.info\">http://baogia.info</a></div>";
    content+="<div id=\"xemthem\">Xem thêm: <ul>";
    content+="<li><a href=\"http://baogia.info/bao-gia-vang/\">Giá vàng hiện tại</a></li>";
    content+="<li><a href=\"http://baogia.info/bien-dong/\">Bi&#7871;n &#273;&#7897;ng giá vàng trong ngày</a></li>";
    content+="<li><a href=\"http://baogia.info/dang-ki/\">Nh&#7853;n báo giá vàng th&#432;&#7901;ng xuy&#234;n</a></li>";
    content+="</ul></div>";
    if (!DEBUG)
        post_blog(title,slug,content,"giavang.net");
    else
    {
        cout<<"Get from giavang.net: "<<url<<endl;
        cout<<"-Title: "<<title<<endl;
        cout<<"-slug: "<<slug<<endl;
        cout<<"-Content: "<<content<<endl;
    }
    return true;
}

/**
 * Scans the giavang.net "latest news" listing and hands every link whose
 * attributes contain "vang-" or "-vang" to giavang_detail().
 *
 * Returns silently when the page cannot be loaded, and stops scanning as
 * soon as giavang_detail() returns false (slug already posted).
 *
 * NOTE(review): the retain/remove sequence follows the page's nested <div>
 * layout and is order-dependent; the meaning of the numeric argument is
 * defined in shtml.h -- confirm there.
 */
void giavang_cat()
{
    shtml sh,tr,td;
    string http;
    bool bRet;
    if (!sh.loadFromURL("http://www.giavang.net/category/tin-moi-nhat/")) return;
    cout<<"Lay tin tu giavang.net"<<endl;
    sh.retainTagByName("div");
    sh.removeTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.removeTagByName("div",2);
    sh.retainTagByName("div");
    sh.retainTagByName("div");
    sh.removeTagByName("div");
    sh.retainTagByName("div");
    sh.retainTagByName("ul");

    // One candidate link per <li>.
    tr=sh.cutTagByName("li");
    while (!tr.isEmpty())
    {
        tr.retainTagByName("a");
        if (tr.containAttr("vang-")||tr.containAttr("-vang"))
        {
            td.setContent(tr.getAttr());
            http=td.getBetween("href=\"","\"");
            bRet=giavang_detail(http);
            if (!bRet) return;
            //cout<<http<<endl;
            //break;
        }
        tr=sh.cutTagByName("li");
    }
}
int main(int argc, char** argv)
{
    if (argc>1) DEBUG=true;
    init_mysql_conf();
    init_mysql();

    vnexpress_cat();
    ndhmoney_cat();
    vneconomy_cat();
    giavang_cat();
    return 0;
}
